library(tidyverse)
library(rvest)
# Worked example data (hours studied vs. exam admission) scraped from
# the Wikipedia article on logistic regression.
lr_data_url <- "https://en.wikipedia.org/wiki/Logistic_regression"
# Grab the first table with class .wikitable on the page as a data frame.
lr_data_raw <- read_html(x = lr_data_url) %>%
html_elements(".wikitable") %>%
html_table() %>%
.[[1]]
# Reshape the wide scraped table into two tidy columns:
# 학습시간 (hours studied) and 입학여부 (admitted, 0/1).
# NOTE(review): assumes clean_names() yields a row-label column `x1` whose
# hours row contains the word "Hours" -- verify against the live page.
lr_tbl <- lr_data_raw %>%
janitor::clean_names() %>%
pivot_longer(-x1) %>%
mutate(구분 = ifelse(str_detect(x1, "Hours"), "학습시간", "입학여부")) %>%
select(name, 구분, value) %>%
pivot_wider(names_from = 구분, values_from = value) %>%
select(학습시간, 입학여부)
# fs::dir_create("data")
# Cache the scraped table locally so later runs do not depend on Wikipedia.
lr_tbl %>%
  write_rds("data/lr_tbl.rds")
# Reload from the cached copy.
lr_tbl <- read_rds("data/lr_tbl.rds")
# Interactive preview of the study-hours / admission data.
lr_tbl %>%
  reactable::reactable()
lr_tbl %>%
skimr::skim()
| Name | Piped data |
| Number of rows | 20 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| 학습시간 | 0 | 1 | 2.79 | 1.51 | 0.5 | 1.69 | 2.62 | 4.06 | 5.5 | ▇▇▆▅▅ |
| 입학여부 | 0 | 1 | 0.50 | 0.51 | 0.0 | 0.00 | 0.50 | 1.00 | 1.0 | ▇▁▁▁▇ |
# Scatter plot of admission (0/1) against study hours, with the fitted
# logistic curve overlaid via geom_smooth(method = "glm").
lr_tbl %>%
  ggplot(aes(x = 학습시간, y = 입학여부)) +
    geom_point() +
    geom_smooth(method = "glm",
                method.args = list(family = "binomial"),
                se = FALSE) +
    labs(title = "학습시간과 입학확률",
         x = "학습시간",
         y = "합격확률 (%)") +
    theme_light() +
    scale_y_continuous(labels = scales::percent)

# Fit the logistic regression model: admission ~ study hours.
adm_lr <- glm(입학여부 ~ 학습시간, family = "binomial", data = lr_tbl)
adm_lr
Call: glm(formula = 입학여부 ~ 학습시간, family = "binomial",
data = lr_tbl)
Coefficients:
(Intercept) 학습시간
-4.078 1.505
Degrees of Freedom: 19 Total (i.e. Null); 18 Residual
Null Deviance: 27.73
Residual Deviance: 16.06 AIC: 20.06
library(crosstalk)
# Grid of study hours over which to evaluate the fitted model.
crosstalk_lr_raw <- tibble(학습시간 = seq(from = 1, to = 5, by = 0.1))
crosstalk_lr_tbl <- crosstalk_lr_raw %>%
  # Predicted admission probability from the fitted glm.
  mutate(합격확률 = predict(adm_lr, newdata = crosstalk_lr_raw, type = "response")) %>%
  # Attach observed outcomes; join key made explicit (the only shared
  # column) so the implicit-key message and accidental key drift go away.
  left_join(lr_tbl, by = "학습시간") %>%
  # Recode 0/1 to labelled factor levels for the legend.
  mutate(입학여부 = factor(입학여부, levels = c(0, 1), labels = c("불합격", "합격")))
# One chart combining the model's predicted probability curve with the
# observed pass/fail outcomes.
crosstalk_lr_g <- crosstalk_lr_tbl %>%
ggplot(aes(x = 학습시간, y = 합격확률) ) +
geom_point() +
# Observed outcomes: factor levels are coded 1/2, so subtract 1 to place
# them back on the 0/1 probability scale.
geom_point(aes(x = 학습시간, y = as.numeric(입학여부) - 1, color = 입학여부 ) ) +
geom_smooth(method = "glm", method.args = list(family = "binomial"),
se = FALSE) +
labs(title = "학습시간과 입학확률",
x = "학습시간",
y = "합격확률 (%)") +
theme_light() +
scale_y_continuous(labels = scales::percent)
plotly::ggplotly(crosstalk_lr_g )
최우추정량(MLE)을 찾는 것은 음의 로그 우도(Negative Log-Likelihood) 값을 최소화하는 것과 동일하기 때문에, General-purpose Optimization 도구에 목적 함수를 정의하고 초기 모수와 함께 넣어 반복적으로 근사시켜 모수를 계산한다.
\[ NLL(y) = -{\log(p(y))} \]
\[ \min_{\theta} \sum_y {-\log(p(y;\theta))} \]
\[ \max_{\theta} \prod_y p(y;\theta) \]
# Logistic (sigmoid) function: maps any real number into (0, 1).
# Vectorized over `x`.
sigmoid_fn <- function(x) {
  denom <- 1 + exp(-x)
  1 / denom
}
neg_log_likelihood <- function(par, data, y, include_alpha = TRUE) {
  # Negative log-likelihood of a logistic regression model.
  #
  # par: numeric parameter vector. With include_alpha = TRUE the first
  #      element is the intercept and the rest are slope coefficients;
  #      otherwise every element is a slope.
  # data: data frame/tibble containing the predictors and the response.
  # y: name (string) of the 0/1 response column in `data`.
  # include_alpha: include an intercept term? (was `T`; spelled out as
  #      TRUE since `T` is reassignable).
  #
  # Returns the scalar -sum(y*log(p) + (1-y)*log(1-p)).

  # drop = FALSE keeps `x` a data frame even with a single predictor:
  # a base data.frame would otherwise collapse to a bare vector and
  # mapply()/rowSums() below would fail.
  x <- data[, names(data) != y, drop = FALSE]
  y_data <- data[[y]]  # response as a plain numeric vector

  # 1. Linear combination: theta = (alpha +) beta_1*x_1 + beta_2*x_2 + ...
  if (include_alpha) {
    # Slopes applied column-wise: beta_1 * x_1 + beta_2 * x_2 + ...
    linear_combination <- mapply("*", x, par[2:length(par)])
    # Add the intercept: alpha + beta_1 * x_1 + ...
    theta <- rowSums(linear_combination) + par[1]
  } else {
    theta <- rowSums(mapply("*", x, par))
  }

  # 2. Success probability via the logistic function.
  p <- sigmoid_fn(theta)
  # p <- exp(theta) / (1 + exp(theta))

  # 3. Negative log-likelihood of the Bernoulli outcomes.
  -sum(y_data * log(p) + (1 - y_data) * log(1 - p))
}
library(optimx)
# Minimize the negative log-likelihood directly with Nelder-Mead,
# starting from (0, 0) = (intercept, slope); at most 100 iterations.
# trace = TRUE prints the optimizer's progress below.
lr_opt <- optimx(
par = c(0,0),
fn = neg_log_likelihood,
data = lr_tbl,
y = "입학여부",
method = "Nelder-Mead",
include_alpha = TRUE,
itnmax = 100,
control = list(trace = TRUE, all.methods = FALSE)
)
fn is fn
Looking for method = Nelder-Mead
Function has 2 arguments
Analytic gradient not made available.
Analytic Hessian not made available.
Scale check -- log parameter ratio= -Inf log bounds ratio= NA
Method: Nelder-Mead
Nelder-Mead direct search function minimizer
function value for initial parameters = 13.862944
Scaled convergence tolerance is 2.06574e-07
Stepsize computed as 0.100000
BUILD 3 13.887933 13.096825
EXTENSION 5 13.862944 12.582216
EXTENSION 7 13.096825 12.067907
EXTENSION 9 12.582216 11.285614
LO-REDUCTION 11 12.067907 11.285614
LO-REDUCTION 13 11.874408 11.285614
EXTENSION 15 11.469089 10.348151
LO-REDUCTION 17 11.285614 10.348151
EXTENSION 19 10.370274 8.914547
LO-REDUCTION 21 10.348151 8.914547
EXTENSION 23 9.266657 8.226456
REFLECTION 25 8.914547 8.030679
LO-REDUCTION 27 8.259889 8.030679
LO-REDUCTION 29 8.226456 8.030679
LO-REDUCTION 31 8.114076 8.030679
HI-REDUCTION 33 8.053435 8.030679
HI-REDUCTION 35 8.052924 8.030679
LO-REDUCTION 37 8.038012 8.030679
HI-REDUCTION 39 8.033349 8.030679
LO-REDUCTION 41 8.031439 8.030144
HI-REDUCTION 43 8.030679 8.030087
HI-REDUCTION 45 8.030144 8.029950
HI-REDUCTION 47 8.030087 8.029938
HI-REDUCTION 49 8.029950 8.029917
HI-REDUCTION 51 8.029938 8.029881
LO-REDUCTION 53 8.029917 8.029881
HI-REDUCTION 55 8.029890 8.029881
LO-REDUCTION 57 8.029889 8.029880
HI-REDUCTION 59 8.029881 8.029880
HI-REDUCTION 61 8.029880 8.029879
HI-REDUCTION 63 8.029880 8.029879
REFLECTION 65 8.029879 8.029879
Exiting from Nelder Mead minimizer
67 function evaluations used
Post processing for method Nelder-Mead
Successful convergence!
Compute Hessian approximation at finish of Nelder-Mead
Compute gradient approximation at finish of Nelder-Mead
Save results from method Nelder-Mead
$par
[1] -4.076953 1.504453
$value
[1] 8.029879
$message
NULL
$convcode
[1] 0
$fevals
function
67
$gevals
gradient
NA
$nitns
[1] NA
$kkt1
[1] TRUE
$kkt2
[1] TRUE
$xtimes
user.self
0.08
Assemble the answers
# Pull the optimized parameters (p1 = intercept, p2 = slope) out of the
# optimx result for comparison with the glm() coefficients below.
lr_opt[, 1:5] %>%
as.data.frame() %>%
rownames_to_column("method") %>%
filter(method == "Nelder-Mead") %>%
select(method, p1, p2)
       method        p1       p2
1 Nelder-Mead -4.076953 1.504453
glm() 함수로 구현한 것과 값이 동일한지 상호확인한다.
adm_lr$coefficients
(Intercept)    학습시간
-4.077713 1.504645
predict_fn <- function(x, par) {
  # Predicted success probabilities from logistic-regression parameters.
  #
  # x: data frame of predictor columns (one column per slope).
  # par: parameter vector; when it has one more element than x has
  #      columns, its first element is treated as the intercept.
  #
  # Returns a numeric vector of probabilities, one per row of x.

  # optimx returns coefficients as a one-row data frame; flatten once to
  # a plain numeric vector so all indexing below is uniform (the original
  # only coerced the intercept via as.numeric(par[1])).
  par <- as.numeric(unlist(par))
  if (ncol(x) < length(par)) {
    # Intercept plus slopes.
    theta <- rowSums(mapply("*", x, par[2:length(par)])) + par[1]
  } else {
    # Slopes only.
    theta <- rowSums(mapply("*", x, par))
  }
  sigmoid_fn(theta)
}
# Compare the hand-rolled predictions (optimx parameters + predict_fn)
# against predict.glm() on the same data; they should agree closely.
custom_pred <- predict_fn(lr_tbl %>% select(학습시간), lr_opt[, 1:2])
lr_tbl %>%
mutate(자체제작_예측 = custom_pred) %>%
mutate(예측 = predict(adm_lr, newdata = lr_tbl %>% select(학습시간), type ="response")) %>%
reactable::reactable()
library(nnet)
library(NeuralNetTools)
# Rename to ASCII column names before fitting the neural network.
# NOTE(review): presumably for formula/plot compatibility -- confirm.
lr_nn_tbl <- lr_tbl %>%
rename(adm_yn = 입학여부,
learning_hour = 학습시간)
# Single-hidden-layer network (size = 2 hidden units) on the same
# admission data; softmax = FALSE keeps a single output unit.
adm_nn <- nnet(adm_yn ~ learning_hour,
size = 2,
softmax = FALSE,
data = lr_nn_tbl)# weights: 7
initial value 5.508862
iter 10 value 2.774533
iter 20 value 2.721744
iter 30 value 2.601059
iter 40 value 2.486752
iter 50 value 2.254115
iter 60 value 2.226825
iter 70 value 2.226668
iter 80 value 2.226582
iter 90 value 2.226488
iter 100 value 2.226445
final value 2.226445
stopped after 100 iterations
# Load an updated plotnet() helper from a gist pinned to a specific
# revision hash. NOTE(review): this executes remote code at runtime;
# the pinned hash limits tampering, but consider vendoring the file.
devtools::source_url('https://gist.githubusercontent.com/Peque/41a9e20d6687f2f3108d/raw/85e14f3a292e126f1454864427e3a189c2fe33f3/nnet_plot_update.r')
plotnet(adm_nn)
데이터 과학자 이광춘 저작
kwangchun.lee.7@gmail.com